{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/wVN12smEvqg?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#@title\n",
    "from IPython.display import HTML\n",
    "\n",
    "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/wVN12smEvqg?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers and Datasets libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install datasets transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'label': 'POSITIVE', 'score': 0.9598047137260437},\n",
       " {'label': 'NEGATIVE', 'score': 0.9994558095932007}]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "classifier = pipeline(\"sentiment-analysis\")\n",
    "classifier([\n",
    "    \"I've been waiting for a HuggingFace course my whole life.\", \n",
    "    \"I hate this so much!\"\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': <tf.Tensor: shape=(2, 16), dtype=int32, numpy=\n",
      "array([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,\n",
      "        12172,  2607,  2026,  2878,  2166,  1012,   102],\n",
      "       [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,\n",
      "            0,     0,     0,     0,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 16), dtype=int32, numpy=\n",
      "array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
      "       [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "raw_inputs = [\n",
    "    \"I've been waiting for a HuggingFace course my whole life.\", \n",
    "    \"I hate this so much!\",\n",
    "]\n",
    "inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors=\"tf\")\n",
    "print(inputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertModel: ['pre_classifier', 'classifier', 'dropout_19']\n",
      "- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2, 16, 768)\n"
     ]
    }
   ],
   "source": [
    "from transformers import TFAutoModel\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "model = TFAutoModel.from_pretrained(checkpoint)\n",
    "outputs = model(inputs)\n",
    "print(outputs.last_hidden_state.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']\n",
      "- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_57']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(\n",
      "[[-1.5606967  1.6122819]\n",
      " [ 4.169231  -3.3464472]], shape=(2, 2), dtype=float32)\n"
     ]
    }
   ],
   "source": [
    "from transformers import TFAutoModelForSequenceClassification\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "outputs = model(inputs)\n",
    "print(outputs.logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(\n",
      "[[4.0195346e-02 9.5980465e-01]\n",
      " [9.9945587e-01 5.4418424e-04]], shape=(2, 2), dtype=float32)\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "predictions = tf.math.softmax(outputs.logits, axis=-1)\n",
    "print(predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: 'NEGATIVE', 1: 'POSITIVE'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.config.id2label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "name": "What happens inside the pipeline function? (TensorFlow)",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
